Analysis info and important parameters

rsfMRI data Parameters:

  • rsfMRI: R1S1 mr_pcorr.txt file

  • Z norm before load: False

  • upsampling: UP

ML Parameters:

  • Z norm in glmnet: True

  • W: constant declarative=0.5, procedural=0.5 (because of UP sampling) normalized maxLL_difference (min max normalization)

  • Hyperparameter tuning: nfolds = length(Y)

  • Model evaluation: nfolds = 20

Nested CV Parameters

  • Betas: |mean beta| > 0.05

  • Lambda: 1se min

iGraph Parameters

  • connectom betas: NOT nested beta
  • connection counts (duplicates): YES

Load subject data

# Load the Power et al. (2011) parcellation table: one row per ROI with its
# MNI coordinates (X, Y, Z), numeric network id, display color, and the
# human-readable network name. Column types are declared explicitly so a
# malformed file fails loudly instead of being silently guessed.
power2011 <- read_csv("../behavior/power_2011.csv",
                      col_types = cols(ROI = col_double(),
                                       X = col_double(),
                                       Y = col_double(),
                                       Z = col_double(),
                                       Network = col_double(),
                                       Color = col_character(),
                                       NetworkName = col_character()))

# Keep only the columns used downstream, in a fixed order.
power2011 <- dplyr::select(power2011, ROI, X, Y, Z, Network, Color, NetworkName)

Create the Group-Level Regressor Matrix \(X\)

We now need to load the group-level data. In essence, this corresponds to creating a matrix X in which every individual is a row and every column is a different ROI-to-ROI connection.

# Build (or restore from cache) the group-level connectivity matrix C:
# one row per subject, one column per ROI-to-ROI partial correlation.
SKIP <- TRUE
if (SKIP){
  # Fast path: restore the previously computed matrix from the cache.
  # NOTE(review): C2 is expected to be defined by the RData file loaded
  # here — confirm the cache contents before relying on this branch.
  load("./__cache__/C_UP.RData")
  # color setup: remap Power colors onto a fixed 14-entry brewer palette
  colors <- factor(power2011$Color)
  levels(colors) <-  c(brewer.pal(name="Set3", n = 12), brewer.pal(name="Set1", n = 9))[1:14]
  power2011$Color <- as.character(colors)
  #df.UP <- read_csv("./__cache__/REST1_SES01_MR_PCORR_UP_2022FEB_subj.csv")
  #C <- dfX.UP %>% dplyr::select(-HCPID) %>% as.matrix()
  C <- C2
  # Force every column to numeric (cached data may carry character columns).
  C <- apply(C, 2, FUN=as.numeric)
  n <- dim(C)[[1]]
} else {
  # Slow path: read every subject's partial-correlation matrix from disk.
  NOFLY <- c()   # subjects with NA connectivity values (flagged for exclusion)
  SUBJS <- c()   # subject IDs, in the row order of C
  # Column labels: all ordered ROI pairs "i-j", flattened column-wise to
  # match how each subject's matrix is vectorized below.
  cols <- outer(power2011$ROI, power2011$ROI, function(x, y) {paste(x, y, sep="-")})
  cols %<>% as.vector
  
  # Order-independent label for an ROI (or network) pair: "min-max".
  connection <- function(x, y) {
    paste(min(x, y), max(x, y), sep="-")
  }
  
  vconnection <- Vectorize(connection)
  
  # Statistical mode: the most frequent value of x.
  Mode <- function(x, na.rm=F) {
    if (na.rm) {
      x = x[!is.na(x)]
    }
    ux <- unique(x)
    return(ux[which.max(tabulate(match(x, ux)))])
  }
  
  # One row per network id with its (modal) readable name.
  reduced_power2011 <- power2011 %>% 
    dplyr::select(Network, NetworkName) %>%
    group_by(Network) %>%
    summarize(Network = mean(Network), NetworkName = Mode(NetworkName))
  
  # Readable label for a network pair (dplyr version).
  # NOTE(review): this pastes one-column tibbles rather than their
  # contents; it is superseded by connection_name2 below and vconnection_name
  # appears unused — consider removing.
  connection_name <- function(x, y) {
    first <- min(x, y)
    second <- max(x, y)
    paste(reduced_power2011 %>% filter(Network == first) %>% dplyr::select(NetworkName) ,
          reduced_power2011 %>% filter(Network == second) %>% dplyr::select(NetworkName),
          sep="-")
    
  }
  
  vconnection_name <- Vectorize(connection_name)
  
  # Readable label for a network pair (base-indexing version; the one
  # actually used to build netnames below).
  connection_name2 <- function(x, y) {
    first <- min(x, y)
    second <- max(x, y)
    paste(reduced_power2011$NetworkName[reduced_power2011$Network == first],
          reduced_power2011$NetworkName[reduced_power2011$Network == second],
          sep="-")
    
  }
  
  vconnection_name2 <- Vectorize(connection_name2)
  
  
  # Per-column network-pair ids and readable names, aligned with cols.
  nets <- outer(power2011$Network, power2011$Network, vconnection)
  nets %<>% as.vector
  netnames <- outer(power2011$Network, power2011$Network, vconnection_name2)
  netnames %<>% as.vector
  
  
  # Pre-allocate C: one row per subject directory, one column per ROI pair.
  n <- length(grep("sub-*", dir("./connectivity_matrix/REST1")))
  C <- matrix(data = rep(0, length(cols)*n), nrow =  n)
  
  j <- 1
  
  R <- NULL
  PR <- NULL
  
  for (sub in dir("./connectivity_matrix/REST1")[grep("sub-*", dir("./connectivity_matrix/REST1"))]) {
    SUBJS %<>% c(strsplit(sub, "-")[[1]][2])
    # Read this subject's 264x264 partial-correlation matrix; the first
    # column (X1) holds row labels and is dropped when vectorizing.
    M <- paste("./connectivity_matrix/REST1", 
               sub, 
               "ses-01/mr_pcorr.txt", sep="/") %>%
      read_csv(skip = 1,
               col_names = F,
               col_types = cols(
                 .default = col_double(),
                 X1 = col_character()
               )) %>%
      as.matrix() 
    v <- as_vector(M[,2:265])  # v spreads M column-wise. M is symmetrical, so it should not matter, but better not risk it
    C[j,] <- v
    if (length(v[is.na(v)]) > 0) {
      print(paste("NA detected in sub", sub))
      NOFLY %<>% c(sub)  # Adds sub to the NOFLY list
    }
    
    j <- j + 1
  }
  # M was read with a character label column, so coerce everything numeric.
  C <- apply(C, 2, FUN=as.numeric)
}

Define the Networks

# Networks of interest (NOI): the 14 Power-2011 network labels retained
# for this analysis.
NOI <- c(
  "Uncertain",
  "Sensory/somatomotor Hand",
  "Sensory/somatomotor Mouth",
  "Cingulo-opercular Task Control",
  "Auditory",
  "Default mode",
  "Memory retrieval?",
  "Ventral attention",
  "Visual",
  "Fronto-parietal Task Control",
  "Salience",
  "Subcortical",
  "Cerebellar",
  "Dorsal attention"
)

# Connections of interest (COI): every ordered network pair, formatted
# "from-to", flattened column-wise to match the connection-name tables.
COI <- as.vector(outer(NOI, NOI, paste, sep = "-"))

The first censor vector simply removes the redundant columns (since the connectivity from A to B is the same as the connectivity of B to A) and the self-correlations:

# censor[k] is TRUE only for the strictly upper-triangular entries of the
# ROI-by-ROI grid (row ROI < column ROI), flattened column-wise. This drops
# the self-correlations and one copy of each symmetric A-B / B-A pair.
censor <- as.vector(outer(power2011$ROI, power2011$ROI, `<`))

The second censor vector removes unlikely functional connections: those with a partial correlation value \(|r| < 0.05\).

censor2 <- colMeans(C) %>% abs() > 0.05

Now, we combine the censor vectors in a tibble that contains all of the relevant information about each column in C.

# Per-column bookkeeping for C: one row per ROI-to-ROI column, carrying the
# network-pair id, its readable name, the ROI-pair label, and both censor
# flags. FIX: seq_along() replaces 1:length(nets), which would yield c(1, 0)
# on an empty vector.
# NOTE(review): the name `order` masks base::order() for the rest of this
# script — intentional here, but worth renaming eventually.
order <- tibble(index = seq_along(nets),
                network = nets,
                network_names = netnames,
                connection = cols,
                censor = censor,
                censor2 = censor2)
# Sort so that columns from the same network pair are contiguous.
order %<>% arrange(network)

And we remove all entries for which a censor vector is FALSE (we also create a grouping factor G, in case in the future we want to use Group Lasso).

# Indices of regressor columns that survive all three filters: upper
# triangle only (censor), |mean r| > 0.05 (censor2), and network pair in COI.
I <- order %>%
  filter(censor == TRUE) %>%
  filter(censor2 == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(index) 

# Network-pair grouping factor for a (future) Group Lasso.
# NOTE(review): unlike I, G is NOT filtered by censor2, so its rows do not
# align one-to-one with I — confirm this asymmetry is intentional.
G <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) %>%
  dplyr::select(network) 
# G is the real grouping factor for Lasso!

As a last step, we create the “real” regressor matrix \(X\), which is the proper subset of \(C\) after removing all of the censored columns. Also, we need to load the dependent variable. In this case, it is a binary variable that determines which strategy model best fits the behavioral data of an individual, whether it is the “memory” strategy (\(Y = 1\)) or the “procedural” strategy (\(Y = 2\)).

# Final regressor matrix: only the columns of C that survived all filters.
X <- C[, as_vector(I)]

# Behavioral dependent variables: the best-fitting strategy model per
# participant plus the max-log-likelihood difference between models.
dvs <- read_csv("../actr-models/model_output/max_loglikelihood.csv",
                col_types = cols(
                  .default = col_double(),
                  HCPID = col_character(),
                  best_model = col_character(),
                  maxLL_diff = col_double()
                ))

# Keep the three columns of interest, sorted by subject ID so rows align
# with the rsfMRI data.
dvs <- dvs %>%
  dplyr::select(HCPID, best_model, maxLL_diff) %>%
  arrange(HCPID)

Now we select only the rows of \(X\) and the values of \(Y\) for which we have both rsfMRI and model data.

The dimension of X is: 230, 640

Y <- dfY.UP$best_model1

Finally, we transform the dependent variable \(Y\) into a binary numeric variable with values \((0, 1)\), so that we can use logistic regression.

Y <- as.numeric(as.factor(Y)) - 1

Quality and Characteristics of \(X\) and \(Y\)

Let’s do some visualization and analysis of our independent and dependent variables, just to ensure there are no obvious problems.

Collinearity of Connectivity Regressors \(X\)

The regressors \(X\) is certainly multi-collinear; that is a consequence of having a large number of predictors \(p > n\), which, in turn, is one of the reasons why we are using Lasso. Too much collinearity, however, could be really bad and push Lasso towards selecting non-optimal regressors. To gather a sense of how much collinearity we have, we can plot the distribution of correlations among regressors:

# Pairwise correlations among the regressors; keep each unordered pair once
# by taking the strict lower triangle, then plot their distribution.
corX <- cor(X)
distCor <- as_vector(corX[lower.tri(corX, diag = FALSE)])
distTibble <- tibble(R = distCor)

ggplot(distTibble, aes(x = R)) +
  geom_histogram(col = "white", alpha = 0.5, binwidth = 0.05) +
  theme_pander() +
  ylab("Number of Correlations") +
  xlab("Correlation Value") +
  ggtitle("Distribution of Correlation Values Between Regressors")

All in all, the collinearity is not that bad—all regressors are correlated at \(|r| < 0.25\), and most of them are correlated at \(|r| < 0.1\), with a peak at \(r = 0\).

Distribution of Classes

And now, let’s visualize the histogram of the dependent variable we are trying to predict:

# Bar chart of strategy-class frequencies. The visible imbalance between
# the two classes motivates the observation weights used further below.
dependent <- tibble(strategy = dvs$best_model)

ggplot(dependent, aes(x = factor(strategy), fill = factor(strategy))) +
  geom_bar(col = "white", alpha = 0.5, width = .5) +
  scale_fill_nejm() +
  xlab("Strategy") +
  ylab("Number of Participants") +
  scale_x_discrete(labels = c("m1" = "Declarative", "m2" = "Procedural")) +
  ggtitle("Distribution of Strategy Use") +
  theme_pander() +
  theme(legend.position = "none")

Because the classes are not equally distributed, and participants are more likely to use the memory strategy (\(Y=0\)) than the procedural one (\(Y = 1\)), we would need to adjust the weights of our Lasso model.

Machine-Learning with Lasso

Cross-Validation

We can now define the lasso model. We will use the elastic net approach with \(\alpha = 0\) to generate a pure lasso model. The model will use a binomial (i.e., logistic) regression and will measure the cross-validation error as class misalignment.

To analyze the data, we will use Lasso, a statistical learning system based on penalized regression.

Most of the entries in our \(Y\) vector are coded as “0” (i.e., most participants use the memory strategy), which creates an imbalance. We are going to create an appropriate set of weights so that the two classes are balanced.

New: Instead of using class proportions as weights, we apply the maxLL_difference as a weight, to increase the importance of individuals whose data show a larger maxLL_difference.

# --- Observation weights -------------------------------------------------
# First scheme (kept for reference, overwritten below): inverse-proportion
# weights that balance the two classes.
W <- Y
W[W == 0] <- mean(Y)
W[W == 1] <- (1 - mean(Y))

# Min-max normalization to [0, 1].
# NOTE(review): returns NaN when all values are identical (max == min).
min_max_norm <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}


# Second scheme (the one actually used): weight each subject by the
# normalized maxLL difference, so subjects whose behavior more clearly
# favors one model count more.
# BUG FIX: dplyr joins take `by =`, not `on =`; the previous `on = "HCPID"`
# was silently swallowed by `...` and the join fell back to ALL common
# columns. NOTE(review): confirm dfY.UP shares no columns with dvs other
# than HCPID, otherwise the natural join behaved differently.
maxLL_diff <- dfY.UP %>%
  left_join(dvs, by = "HCPID") %>%
  mutate(maxLL_diff_norm = min_max_norm(maxLL_diff))
W <- maxLL_diff$maxLL_diff_norm

# Full-data Lasso fit (alpha = 1, logistic link); glmnet standardizes the
# predictors internally.
# NOTE(review): type.measure is a cv.glmnet() argument; glmnet() ignores it.
fit <- glmnet(y = Y,
              x = X,
              alpha = 1,
              weights = W,
              family = "binomial",
              type.measure = "class",
              standardize = TRUE
)

To choose the optimal value of \(\lambda\) in Lasso, we will examine the cross-validation error during a LOO cross-validation.

# Leave-one-out cross-validation (nfolds = N) over the lambda path to pick
# the penalty. grouped = F is required by cv.glmnet when folds contain
# fewer than 3 observations, as is the case with LOO folds.
fit.cv <- cv.glmnet(y = Y,
                    x = X,
                    alpha=1,
                    family = "binomial",
                    weights = W,
                    type.measure = "class",
                    standardize=T,
                    nfolds=length(Y),
                    grouped = F
)

Now, let’s look at the cross-validation error profile.

plot(fit.cv)

The profile has the characteristic U-shape or increasing curve, with more error as \(\lambda\) increases. As recommended by Tibshirani, we will select the “lambda 1SE” value, which is the largest \(\lambda\) value that does not differ by more than 1 SE from the \(\lambda\) value that gives us the minimum cross-validation error. This guarantees the maximum generalizability.

We can also visualize the profile of the beta weights of each connection for different values of \(\lambda\).

# Coefficient paths against the L1 norm; the dashed vertical line marks the
# L1 norm of the solution at lambda.min.
plot(fit, sub="Beta Values for Connectivity") 
L1norm <- sum(abs(fit$beta[,which(fit$lambda==fit.cv$lambda.min)]))
abline(v=L1norm, lwd=2, lty=2)

And now, plot a prettier version:

# Re-plot the CV error profile with ggplot: mean CV error with a +/- 1 SD
# ribbon, and a vertical line at the error-minimizing lambda.
lasso_df <- tibble(lambda = fit.cv$lambda,
                   error = fit.cv$cvm,
                   sd = fit.cv$cvsd)

ggplot(lasso_df, aes(x = lambda, y = error)) +
  geom_line(aes(col = error), lwd = 2) +
  scale_color_viridis("Error", option = "plasma") +
  geom_ribbon(aes(ymin = error - sd, ymax = error + sd),
              alpha = 0.2, fill = "blue") +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = lasso_df$lambda[lasso_df$error == min(lasso_df$error)]) +
  theme_pander() +
  theme(legend.position = "right")

The min \(\lambda_{min}\) is 0.0145833, The 1se min \(\lambda_{min}\) is 0.1077812

Examining the Predictive Connectome

Let’s have a better look at the relevant connections that survive the Lasso penalty at the value of \(\lambda_{min} + 1 SE\). We start by saving these connections in a tibble, and saving the data on a file for later use.

# Extract the non-zero Lasso betas at lambda.min and join them back onto
# the column bookkeeping table to build the predictive connectome.
#betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.1se)]
betas <- fit$beta[, which(fit$lambda==fit.cv$lambda.min)]
conn_betas <- as_tibble(data.frame(index=I$index, Beta=betas))
connectome <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas) %>%
  dplyr::select(-censor2) %>%
  filter(Beta != 0) %>%
  
  # reformat connectome: split "roi-roi" and "net-net" labels into parts.
  # NOTE(review): separate() mis-handles the negative network id "-1"
  # (e.g. "-1-5" splits into "", "1", "5"); the mutate below patches
  # network1, and the HARD CODE section after this pipe patches network2.
  separate(connection, c("connection1", "connection2"))%>%
  separate(network, sep = "-", c("network1", "network2"), remove = F) %>%
  #filter(str_detect(network, pattern = "-1-")) %>%
  mutate(network1 = ifelse(str_detect(network, pattern = "-1-"), -1, network1)) %>%
  mutate(connection_type = ifelse(network1==network2, "Within", "Between")) %>%
  arrange(index)


# HARD CODE: repair network2 for pairs involving the "-1" (Uncertain)
# network, which separate() above split incorrectly.
# NOTE(review): connection_type was computed BEFORE these repairs, so rows
# patched here may carry a stale Within/Between label — verify.
connectome[connectome$network=="-1-5","network2"] <- "5"
connectome[connectome$network=="-1-7","network2"] <- "7"
connectome[connectome$network=="-1--1","network2"] <- "-1"
connectome[connectome$network=="-1-11","network2"] <- "11"
connectome[connectome$network=="-1-12","network2"] <- "12"
connectome[connectome$network=="-1-13","network2"] <- "13"
In sum, connectome has 67 non-zero connections, 37 positive beta, and 30 negative betas. We will use these betas for later brain connectivity analysis.

Model Evaluation

plot_prediction <- function(Y, Yp, title) {
  # Scatter of observed class (0/1) against predicted probability, with
  # points classified at the 0.5 threshold, an accuracy annotation, and
  # marginal density plots on both axes (rows with NA are dropped).
  comparison <- tibble(Observed = Y,
                       Predicted = Yp,
                       DiscretePredicted = ifelse(Yp < 0.5, 0, 1)) %>%
    mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                             "Correct",
                             "Misclassified")) %>%
    drop_na()

  # Percent correctly classified, rounded to two decimals.
  aval <- round(100 * nrow(dplyr::filter(comparison, Accuracy %in% "Correct")) /
                  nrow(comparison), 2)

  p <- ggplot(comparison, aes(x = Predicted, y = Observed, col = Accuracy)) +
    geom_point(size = 4, alpha = 0.6,
               position = position_jitter(height = 0.02)) +
    geom_abline(intercept = 0, slope = 1,
                col = "red",
                linetype = "dashed") +
    scale_color_d3() +
    theme_pander() +
    theme(legend.position = "right") +
    guides(col = guide_legend("Classification")) +
    coord_fixed(xlim = c(0, 1), ylim = c(0, 1)) +
    annotate("text", x = 0.3, y = 0.7,
             label = paste("Accuracy (",
                           length(Y),
                           ") = ",
                           aval,
                           "%",
                           sep = "")) +
    ylab("Observed Strategy") +
    xlab("Predicted Strategy") +
    ggtitle(paste("Predicted vs. Observation", title)) +
    theme(legend.position = "bottom")

  # Attach marginal density plots to both axes.
  ggMarginal(p,
             fill = "grey",
             alpha = 0.75,
             type = "density",
             col = "darkgrey",
             margins = "both")
}

plot_roc <- function(Y, Yp, title) {
  # ROC curve (via pROC) for the observed labels against the predictions
  # thresholded at 0.5; the AUC is reported in the plot title.
  comparison <- tibble(Observed = Y,
                       Predicted = Yp,
                       DiscretePredicted = ifelse(Yp < 0.5, 0, 1)) %>%
    mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                             "Correct",
                             "Misclassified")) %>%
    drop_na() %>%
    mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

  rocobj <- roc(comparison$Observed, comparison$ROCPrediction)

  # Grey diagonal marks chance-level performance.
  ggroc(rocobj, col = "red") +
    geom_point(aes(y = rocobj$sensitivities, x = rocobj$specificities),
               col = "red", size = 4, alpha = .5) +
    ggtitle(paste("AUC ROC Curve", title, round(rocobj$auc[[1]], 2))) +
    xlab("Specificity (FPR)") + ylab("Sensitivity (TPR)") +
    geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1),
                 color = "grey", linetype = "dashed") +
    theme_pander()
}

plot_roc_slide <- function(Y, Yp, title) {
  # Manual ROC curve: sweep the classification threshold from 0 to 1 and
  # record the true-negative and true-positive rates at each step.
  # NOTE(review): `title` is accepted for interface parity with plot_roc()
  # but the plot title is fixed.
  wcomparison <- tibble(Observed = Y,
                        Predicted = Yp,
                        DiscretePredicted = ifelse(Yp < 0.5, 0, 1))

  wcomparison %<>% mutate(Accuracy = ifelse(DiscretePredicted == Observed,
                                            "Correct",
                                            "Misclassified")) %>% drop_na()
  wcomparison %<>% mutate(ROCPrediction = if_else(Predicted < 0.5, 0, 1))

  # Per-class accuracy at one threshold, returned as a one-row tibble.
  rates_at <- function(threshold) {
    subthreshold <- wcomparison %>%
      # Clamp probabilities into (0, 1]; values <= 0 get a tiny positive
      # stand-in so they are never classified positive at threshold 0.
      mutate(Prediction = ifelse(Predicted > 1, 1, Predicted)) %>%
      mutate(Prediction = ifelse(Prediction <= 0, 1e-204, Prediction)) %>%
      mutate(Prediction = ifelse(Prediction <= threshold, 0, 1)) %>%
      mutate(Accuracy = ifelse(Prediction == Observed, 1, 0)) %>%
      group_by(Observed) %>%
      summarise(Accuracy = mean(Accuracy))

    # Accuracy on the negative class = specificity (TNR).
    tnr <- subthreshold %>%
      filter(Observed == 0) %>%
      dplyr::select(Accuracy) %>%
      as.numeric()

    # Accuracy on the positive class = sensitivity (TPR).
    tpr <- subthreshold %>%
      filter(Observed == 1) %>%
      dplyr::select(Accuracy) %>%
      as.numeric()

    tibble(Threshold = threshold, TNR = tnr, TPR = tpr)
  }

  # FIX: build the curve in one pass instead of growing a data frame with
  # rbind() inside a loop (quadratic copying).
  curve <- do.call(rbind, lapply(seq(0, 1, 0.01), rates_at))

  s <- ggplot(arrange(curve, TPR), aes(x = TNR, y = TPR)) +
    geom_point(size = 2, col = "red", alpha = 0.5) +
    geom_line(col = "red") +
    ylab("Sensitivity (True Positive Rate)") +
    xlab("Specificity (True Negative Rate)") +
    scale_x_reverse() +
    ggtitle("ROC Curve for Different Thresholds") +
    geom_abline(slope = 1, intercept = 1, col = "grey", linetype = "dashed") +
    theme_pander()
  s
}
# Model evaluation metrics. NOTE(review): all four calls below evaluate on
# the FULL data (X, Y) used for fitting, so these are training-set numbers
# and will be optimistic relative to held-out performance.
# validation data misclassification error
fit.cv.accuracy <- 1-assess.glmnet(fit.cv, X, Y, 
                                   weights = W, 
                                   s="lambda.min", #"lambda.1se", 
                                   family = "binomial")$class %>% as.vector()# best lambda cv error


# Weighted AUC at lambda.min.
fit.cv.auc <- assess.glmnet(fit.cv, X, Y, 
                            weights = W, 
                            s="lambda.min", 
                            family = "binomial")$auc %>% as.vector()# best lambda cv error
  
# training data prediction probabilities
# Hard class labels ("0"/"1") coerced to numeric.
fit.cv.pred <- predict(fit.cv, newx = X, 
                       weights = W, 
                       s="lambda.min", 
                       type="class", 
                       family = "binomial")%>% as.numeric()


# Predicted probabilities of class 1 (type = "response").
fit.cv.predprob <- predict(fit.cv, 
                           newx = X, 
                           weights = W, 
                           s="lambda.min",
                           type="response", family = "binomial")%>% as.numeric()

Calculate training Accuracy score (0.9958241) and AUC score (0.9995458)

Predicted vs. Observed

plot_prediction(Y, fit.cv.predprob, "(Training)")

ROC

plot_roc(Y, fit.cv.predprob, "Training")

ROC Curve By Sliding Threshold

plot_roc_slide(Y, fit.cv.predprob, "Training")

Finally, we can visualize the table of connections

# Render the connectome as a styled HTML table.
# NOTE(review): piping through xtable() before kable() is unusual —
# kable() on the data frame alone would suffice; verify the intended
# output format.
connectome %>%
  xtable() %>%
  kable(digits = 5) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
index network network1 network2 network_names connection1 connection2 censor Beta connection_type
265 -1–1 -1 -1 Uncertain-Uncertain 1 2 TRUE 0.26856 Between
3975 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 15 16 TRUE 4.43556 Within
4505 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 17 18 TRUE 0.93616 Within
6887 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 23 27 TRUE -1.17579 Within
7147 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 19 28 TRUE -1.01809 Within
7416 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 24 29 TRUE -0.77276 Within
8994 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 18 35 TRUE -9.74230 Within
9803 1-1 1 1 Sensory/somatomotor Hand-Sensory/somatomotor Hand 35 38 TRUE -5.05733 Within
12971 1-3 1 3 Sensory/somatomotor Hand-Cingulo-opercular Task Control 35 50 TRUE -1.85842 Between
18271 3-4 3 4 Cingulo-opercular Task Control-Auditory 55 70 TRUE -4.25701 Between
19805 -1-5 -1 5 Uncertain-Default mode 5 76 TRUE -5.67856 Between
22519 5-5 5 5 Default mode-Default mode 79 86 TRUE 3.26786 Within
24368 5-5 5 5 Default mode-Default mode 80 93 TRUE -6.80746 Within
24639 5-5 5 5 Default mode-Default mode 87 94 TRUE -2.23742 Within
24896 5-5 5 5 Default mode-Default mode 80 95 TRUE -5.43170 Within
26765 5-5 5 5 Default mode-Default mode 101 102 TRUE -0.14865 Within
27293 5-5 5 5 Default mode-Default mode 101 104 TRUE 2.12650 Within
30202 5-5 5 5 Default mode-Default mode 106 115 TRUE 0.64052 Within
30209 5-5 5 5 Default mode-Default mode 113 115 TRUE 2.31136 Within
31161 -1-5 -1 5 Uncertain-Default mode 9 119 TRUE 0.16959 Between
31420 -1-5 -1 5 Uncertain-Default mode 4 120 TRUE 1.84038 Between
32057 5-5 5 5 Default mode-Default mode 113 122 TRUE -9.58421 Within
34152 5-5 5 5 Default mode-Default mode 96 130 TRUE -1.65609 Within
35205 5-6 5 6 Default mode-Memory retrieval? 93 134 TRUE 1.78707 Between
35465 5-6 5 6 Default mode-Memory retrieval? 89 135 TRUE 3.92882 Between
37364 -1–1 -1 -1 Uncertain-Uncertain 140 142 TRUE 1.33446 Between
39479 7-7 7 7 Visual-Visual 143 150 TRUE 6.00812 Within
41264 5-7 5 7 Default mode-Visual 80 157 TRUE 0.53693 Between
41860 7-7 7 7 Visual-Visual 148 159 TRUE 0.85506 Within
43436 -1-7 -1 7 Uncertain-Visual 140 165 TRUE 1.27352 Between
43446 7-7 7 7 Visual-Visual 150 165 TRUE -1.87671 Within
44496 7-7 7 7 Visual-Visual 144 169 TRUE 5.70150 Within
46905 8-8 8 8 Fronto-parietal Task Control-Fronto-parietal Task Control 177 178 TRUE 1.92888 Within
47145 7-8 7 8 Visual-Fronto-parietal Task Control 153 179 TRUE 4.76290 Between
47964 -1-8 -1 1 Uncertain-Fronto-parietal Task Control 180 182 TRUE 2.13770 Between
48703 -1-5 -1 5 Uncertain-Default mode 127 185 TRUE 1.05910 Between
49015 8-8 8 8 Fronto-parietal Task Control-Fronto-parietal Task Control 175 186 TRUE 2.40309 Within
51393 8-8 8 8 Fronto-parietal Task Control-Fronto-parietal Task Control 177 195 TRUE 5.16809 Within
52205 8-8 8 8 Fronto-parietal Task Control-Fronto-parietal Task Control 197 198 TRUE 2.10510 Within
53422 5-9 5 9 Default mode-Salience 94 203 TRUE -0.94006 Between
53885 1-9 1 9 Sensory/somatomotor Hand-Salience 29 205 TRUE -5.23179 Between
54042 8-9 8 9 Fronto-parietal Task Control-Salience 186 205 TRUE 0.90361 Between
55525 -1-9 -1 1 Uncertain-Salience 85 211 TRUE -2.12627 Between
57146 5-9 5 9 Default mode-Salience 122 217 TRUE 3.09114 Between
57658 5-9 5 9 Default mode-Salience 106 219 TRUE -3.35738 Between
57770 9-9 9 9 Salience-Salience 218 219 TRUE 4.09291 Within
61745 10-10 10 10 Subcortical-Subcortical 233 234 TRUE -6.11435 Within
61853 5-11 5 11 Default mode-Ventral attention 77 235 TRUE -2.32695 Between
62158 5-11 5 11 Default mode-Ventral attention 118 236 TRUE 7.51715 Between
63560 8-11 8 11 Fronto-parietal Task Control-Ventral attention 200 241 TRUE -2.05476 Between
63762 11-11 11 11 Ventral attention-Ventral attention 138 242 TRUE 1.40898 Within
63865 11-11 11 11 Ventral attention-Ventral attention 241 242 TRUE -0.03107 Within
64659 13-13 13 13 Cerebellar-Cerebellar 243 245 TRUE 13.93071 Within
64814 6-13 6 13 Memory retrieval?-Cerebellar 134 246 TRUE 2.31873 Between
66338 5-12 5 12 Default mode-Dorsal attention 74 252 TRUE -3.44244 Between
66780 -1-12 -1 12 Uncertain-Dorsal attention 252 253 TRUE -4.83619 Between
67455 6-12 6 12 Memory retrieval?-Dorsal attention 135 256 TRUE 1.98378 Between
67477 7-12 7 12 Visual-Dorsal attention 157 256 TRUE 0.37075 Between
67728 7-12 7 12 Visual-Dorsal attention 144 257 TRUE -0.72342 Between
67748 7-12 7 12 Visual-Dorsal attention 164 257 TRUE -0.38728 Between
68005 7-12 7 12 Visual-Dorsal attention 157 258 TRUE 2.82603 Between
68099 12-12 12 12 Dorsal attention-Dorsal attention 251 258 TRUE 0.12297 Within
68142 1-12 1 12 Sensory/somatomotor Hand-Dorsal attention 30 259 TRUE 2.01534 Between
68450 5-12 5 12 Default mode-Dorsal attention 74 260 TRUE -2.84148 Between
68463 5-12 5 12 Default mode-Dorsal attention 87 260 TRUE 0.43389 Between
69206 1-12 1 12 Sensory/somatomotor Hand-Dorsal attention 38 263 TRUE -3.59807 Between
69461 1-12 1 12 Sensory/somatomotor Hand-Dorsal attention 29 264 TRUE -8.53385 Between

Stability of Estimated Beta Weights

And now, let’s visualize the beta weights of the connections: num connections=67

# Dot plot of the surviving connection betas, sorted by weight, with
# bootstrap CIs and network-pair labels on the y axis (after coord_flip).
# FIX: the original added two colour scales (scale_color_gradient2 followed
# by scale_color_viridis); ggplot2 keeps only the last one and warns, so the
# dead gradient2 scale is removed here — the rendered plot is unchanged.
ggplot(connectome, aes(x = reorder(connection, Beta), y = Beta)) +
  geom_point(aes(col=Beta), alpha=.5, 
             size=2,
             position = position_jitter(height = 0, width = 0.3)) +
  stat_summary(fun.data = "mean_sdl", geom="point", fill="black", alpha=1, size=1) +
  scale_x_discrete(labels = 
                     paste(connectome$network_names, 
                           " (", 
                           connectome$connection,
                           ")", sep="")) +
  ggtitle(paste("Connection Weights Across Cross-Validation:", dim(connectome)[[1]])) +
  ylab(expression(paste(beta, " value"))) +
  xlab("Connection") +
  geom_hline(yintercept = 0, col="grey") +
  stat_summary(fun.data = "mean_cl_boot", 
               col="black", geom="errorbar", width=1) +
  scale_color_viridis(option="plasma", begin=0.2, end=0.9) +
  theme_pander() +
  theme(axis.text.y = element_text(angle=0, hjust=1),
        legend.position = "NA") +
  #ylim(-3, 3) +
  coord_flip()
# Alternative view (ggpubr): lollipop chart of betas, colored by sign and
# faceted by within- vs. between-network connection type.
connectome %>% 
  mutate(beta_sign = ifelse(Beta >0, "+", "-")) %>%
  ggdotchart(x = "network_names", y = "Beta",
           color = "beta_sign",                                # Color by groups
           palette = c("steelblue", "tomato"), # Custom color palette
           rotate = TRUE,
           facet.by = "connection_type", 
           sort.by.groups = F,
           sort.val = "desc",          # Sort the value in descending order
           sorting = "descending",                       # Sort value in descending order
           add = "segments",                             # Add segments from y = 0 to dots
           add.params = list(color = "lightgray", size = 2), # Change segment color and size
           group = "connection_type",                                # Order by groups
           dot.size = 3,                                 # Large dot size
           #label = round(connectome$Beta,2),                        # Add mpg values as dot labels
           #font.label = list(color = "white", size = 9,
           #                  vjust = 0.5),               # Adjust label parameters
           #group = "cyl",
           #y.text.col = TRUE,
           title = paste("Lasso Connection Weights:", dim(connectome)[[1]]),
           ggtheme = theme_pander()) +
  geom_hline(yintercept = 0, linetype = 2, color = "black") 

Statistical analysis of Network Distribution

Lasso vs. Power

# Baseline: proportion of ROIs per network in the full Power parcellation
# (restricted to the networks of interest).
subsetPower <- filter(power2011,
                      NetworkName %in% NOI)
noi_stats <- subsetPower %>%
  group_by(NetworkName, Color) %>%
  summarise(N=length(Color)/dim(subsetPower)[1]) %>%
  add_column(Source="Power")

# ROIs that appear in at least one surviving Lasso connection.
lROIs <- unique(c(connectome$connection1, connectome$connection2))

# NOTE(review): lROIs holds CHARACTER ROI labels, so `power2011[lROIs, ]`
# matches the default row names "1".."264" rather than the ROI column —
# correct only while row order equals ROI number; consider as.integer(lROIs).
rois <- power2011[lROIs,]
roi_stats <- rois %>%
  group_by(NetworkName, Color, .drop = F) %>%
  summarise(N=length(Color)/length(lROIs)) %>%
  add_column(Source="Lasso") 


# Align the Lasso distribution with the baseline networks, filling networks
# with no surviving ROI with N = 0.
# BUG FIX: dplyr joins take `by =`, not `on =`; the previous `on = c(...)`
# was silently passed to `...` and the join fell back to the common columns
# (which here happen to be exactly the intended keys).
roi_stats <- roi_stats %>% 
  right_join(noi_stats %>% dplyr::select(NetworkName, Color),
             by = c("NetworkName", "Color")) %>%
  mutate(N = ifelse(is.na(N), 0, N), Source="Lasso") %>%
  arrange(NetworkName)

total_stats <- rbind(noi_stats, roi_stats)

# Side-by-side pie charts: ROI distribution across networks in the full
# Power parcellation vs. among the ROIs selected by Lasso.
# FIX: corrected "Distriution" typo in the plot title.
ggplot(total_stats, aes(x="", y=N, fill=NetworkName)) +
  geom_bar(stat = "identity", col="white", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_manual(values = unique(power2011$Color)) +
  #scale_fill_viridis(discrete = T) +
  #scale_fill_ucscgb() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of ROI") +
  geom_text_repel(aes(label=percent(N, .1)), col="black", 
            position=position_stack(vjust=.01), size=3)+
  theme_pander() +
  guides(fill=guide_legend(ncol=2)) +
  theme(legend.position = "bottom")

# Goodness-of-fit: do the Lasso ROI counts per network differ from the
# Power baseline proportions?
# NOTE(review): assumes roi_stats and noi_stats share identical row order.
chisq.test(roi_stats$N*length(lROIs), p = noi_stats$N)
## 
##  Chi-squared test for given probabilities
## 
## data:  roi_stats$N * length(lROIs)
## X-squared = 17.765, df = 13, p-value = 0.1667

Lasso vs. Power:

Between vs. Within

# Split a "from-to" network-pair label into its two endpoints.
# Caveat: a leading "-1" network id splits at its minus sign too (e.g.
# "-1-5" yields from = "" and to = "1"), matching the original behavior.
net_from <- function(s) {
  parts <- strsplit(s, "-")[[1]]
  as.character(parts[1])
}
net_to <- function(s) {
  parts <- strsplit(s, "-")[[1]]
  as.character(parts[2])
}

vnet_from <- Vectorize(net_from)
vnet_to <- Vectorize(net_to)

# Classify a network-pair label as a within- or between-network connection.
connectivity <- function(s) {
  if (net_from(s) == net_to(s)) "Within" else "Between"
}

vconnectivity <- Vectorize(connectivity)
# Candidate connections of interest: keep the uncensored upper-triangle
# columns whose network pair is in COI, then annotate each with its two
# endpoint networks and a Within/Between connection type.
coi <- order %>%
  filter(censor == TRUE) %>%
  filter(network_names %in% COI) %>%
  mutate(from = vnet_from(network_names),
         to = vnet_to(network_names),
         connection_type = vconnectivity(network_names))

# Within/between counts and proportions among all candidate connections
# (the baseline distribution).
coi_stats <- coi %>%
  group_by(connection_type) %>%
  summarise(N = n(), P = n() / nrow(coi)) %>%
  add_column(Source = "Power et al. (2011)")

# Recompute the connection type on the Lasso connectome, then tabulate it
# the same way.
connectome$connection_type <- vconnectivity(connectome$network_names)
connectome_stats <- connectome %>%
  group_by(connection_type) %>%
  summarise(N = n(), P = n() / nrow(connectome)) %>%
  add_column(Source = "Lasso Analysis")

connect_stats <- rbind(connectome_stats, coi_stats)

# Pie charts of within- vs. between-network connection proportions:
# Lasso connectome vs. all candidate connections.
# FIX: corrected "Distribuction" typo in the plot title.
ggplot(connect_stats, aes(x="", y=P, fill=connection_type)) +
  geom_bar(stat = "identity", col="grey", width=1) +
  facet_grid(~Source, labeller = label_both) +
  coord_polar("y", start=0) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 2L)) +
  scale_fill_jama() +
  scale_color_jama() +
  ylab("") +
  xlab("") +
  ggtitle("Distribution of Connectivity\n(Within/Between Networks)") +
  geom_text_repel(aes(label=percent(P, .1)), col="white",
            position=position_stack(vjust=1), size=3)+
  theme_pander() +
  theme(legend.position = "bottom")

# Do the Lasso within/between counts match the baseline proportions?
# NOTE(review): assumes both tables order connection_type identically
# (alphabetical after group_by/summarise).
chisq.test(connectome_stats$N, p=coi_stats$P)
## 
##  Chi-squared test for given probabilities
## 
## data:  connectome_stats$N
## X-squared = 86.667, df = 1, p-value < 2.2e-16

Nested Cross-Validation

NOTE: In this case, lambda is not the same for each subject.

# Setup for the nested (leave-one-out outer loop) cross-validation below.
N <- length(Y)   # number of observations (outer folds)
P <- ncol(X)     # number of candidate connections
# Per-fold coefficient estimates and held-out predictions, preallocated.
betas <- matrix(0, nrow = N, ncol = P)
Yp <- rep(0, N)
# Minimum number of non-zero features required to accept a lambda.
# FIX: use `<-` for top-level assignment, consistent with the rest of file.
minF <- 1
# NOTE(review): the Fisher-z transform of X is left disabled — confirm
# whether the partial correlations should be atanh-transformed first.
#X <- atanh(X)  ## ??? WHY USE atanh
dfX <- as.data.frame(cbind(Y, X))
for (i in 1:N) {
  Ytrain <- Y[-i]
  Xtrain <- X[-i,]
  Wtrain <- W[-i]
  # fit <- ncvreg
  fit <- glmnet(y = Ytrain,
                x = Xtrain,
                weights = Wtrain,
                alpha = 1,
                family = "binomial",
                type.measure ="class",
                standardize = T
  )
  
  fit.cv <- cv.glmnet(y = Ytrain,
                      x = Xtrain,
                      alpha=1,
                      weights=Wtrain,
                      #penalty="SCAD",
                      family = "binomial",
                      type.measure = "class",
                      standardize=T,
                      grouped=F,
                      nfolds=20
                      #nfolds=length(Ytrain)
  )
  
  
  lambda <- fit.cv$lambda.min
  nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)]
  
  if (fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)] > 0) {
    lambda <- fit.cv$lambda.min
    nzero <- fit.cv$nzero[which(fit.cv$lambda == fit.cv$lambda.min)]
  }
  
  if (nzero < minF) {
    # If no features, select a less-generalizable lambda
    lambda <- fit.cv$lambda[which(fit.cv$nzero >= minF)[1]]
  } 
  
  B <- fit$beta[,which(fit$lambda==lambda)]
  #B <- coef(fit.cv, s=lambda) %>% as.matrix()
  #B <- fit$beta[,which(fit$lambda==fit$lambda[60])]
  #print(B)
  #print(fit.cv$lambda.min)
  #plot(fit.cv)
  if (length(B[B!=0])) {
    #print(c(i, length(B[B!=0])))
    dfX <- data.frame(cbind(Y, X[, B != 0]))
    #lmod<-lm(Y ~ . + 1, data=dfX[-i,])
    #print(lmod)
    #Xtest <- dfX[i,-1]
    #yp <- lmod$coefficients[1] + sum(B*X[i,])# predict on test data
    yp <- predict(fit.cv, newx = X[-i,], s=lambda, type="response")
    assess.glmnet(fit.cv, X[-i,], Y[-i], weights = W[-i], s=fit.cv$lambda.min, family = "binomial")$class # best lambda cv error
    assess.glmnet(fit.cv, X[-i,], Y[-i], weights = W[-i], s=fit.cv$lambda.min, family = "binomial")$auc # best lambda cv auc
    assess.glmnet(fit, X[-i,], Y[-i], weights = W[-i], s=fit.cv$lambda.min, family = "binomial")$class # best lambda cv error = same as first
    
  } else {
    yp <- mean(Ytrain)
  }
  betas[i,] <- B
  Yp[i] <- yp
}
set.seed(0)
# Repeated random-split ("nested") CV: nrounds splits, each holding out ntest
# subjects; inverse-prevalence class weights are recomputed per training set.
#dfX <- data.frame(cbind(Y, X[,betas != 0]))
nrounds <- 200
nfolds <- 20
ntest <- 30
nP <- ncol(X)
nO <- nrow(X)
# Training log (currently unused downstream)
Yp.train <- matrix(rep(NA, nO * nO),  ncol = nO) %>% as.data.frame()

# Prediction log: rows = subjects, cols = rounds; NA where a subject was not
# in that round's train/test partition.
nested.train.Yp <- matrix(rep(NA, nO * nrounds),  ncol = nrounds)
nested.train.Ypp <- matrix(rep(NA, nO * nrounds),  ncol = nrounds)
nested.test.Yp <- matrix(rep(NA, nO * nrounds),  ncol = nrounds)
nested.test.Ypp <- matrix(rep(NA, nO * nrounds),  ncol = nrounds)

### Score log
nested.train.errorscore <- c()
nested.train.aucscore <- c()
nested.test.errorscore <- c()
nested.test.acuscore <- c()   # NOTE: "acu" typo kept -- later chunks reference this name

### Coefs log: one column of betas per round
Xcoef <- matrix(rep(NA, nP * nrounds),  ncol = nrounds)

### Best Lambda log
nested.lambdas <- c()

for (i in seq_len(nrounds)) {
  tryCatch({
    # Random held-out test set for this round.
    itest <- sample(seq_along(Y), ntest, replace = FALSE)
    # Inverse-prevalence class weights computed on the training subjects only.
    iW <- Y[-itest]
    iW[iW == 0] <- mean(Y[-itest])
    iW[iW == 1] <- (1 - mean(Y[-itest]))

    # NOTE(review): standardize=F here vs standardize=T in cv.glmnet below --
    # confirm which is intended. `ilasso` itself is not used downstream
    # (coefficients are taken from ilasso.cv).
    ilasso <- glmnet(x = X[-itest, ], y = Y[-itest],
                     alpha = 1,
                     weights = iW,
                     family = "binomial",
                     type.measure = "class",
                     standardize = F)

    ilasso.cv <- cv.glmnet(x = X[-itest, ], y = Y[-itest],
                           alpha = 1,
                           weights = iW,
                           #penalty="SCAD",
                           family = "binomial",
                           type.measure = "class",
                           standardize = T,
                           grouped = F,
                           nfolds = nfolds)

    # FIX: use THIS round's inner-CV lambda. The original read
    # fit.cv$lambda.min, i.e. a stale model left over from the previous
    # (leave-one-out) chunk, so every round used the wrong lambda.
    bestlambda <- ilasso.cv$lambda.min
    nested.lambdas <- c(nested.lambdas, bestlambda)

    # Training-data misclassification error and AUC at the chosen lambda.
    ilasso.cv.error <- assess.glmnet(ilasso.cv, X[-itest,], Y[-itest], weights = W[-itest], s=bestlambda, family = "binomial")$class %>% as.vector()
    ilasso.cv.auc <- assess.glmnet(ilasso.cv, X[-itest,], Y[-itest], weights = W[-itest], s=bestlambda, family = "binomial")$auc %>% as.vector()
    # Training-data class predictions and probabilities.
    ilasso.cv.pred <- predict(ilasso.cv, newx = X[-itest,], weights = W[-itest], s=bestlambda, type="class", family = "binomial") %>% as.numeric()
    ilasso.cv.predprob <- predict(ilasso.cv, newx = X[-itest,], weights = W[-itest], s=bestlambda, type="response", family = "binomial") %>% as.numeric()

    # Held-out test-data error/AUC and predictions.
    ilasso.test.error <- assess.glmnet(ilasso.cv, X[itest,], Y[itest], weights = W[itest], s=bestlambda, family = "binomial")$class %>% as.vector()
    ilasso.test.auc <- assess.glmnet(ilasso.cv, X[itest,], Y[itest], weights = W[itest], s=bestlambda, family = "binomial")$auc %>% as.vector()
    ilasso.test.pred <- predict(ilasso.cv, newx = X[itest,], weights = W[itest], s=bestlambda, type="class", family = "binomial") %>% as.numeric()
    ilasso.test.predprob <- predict(ilasso.cv, newx = X[itest,], weights = W[itest], s=bestlambda, type="response", family = "binomial") %>% as.numeric()

    # Coefficients at the chosen lambda (row 1, the intercept, dropped).
    B <- coef(ilasso.cv, s = bestlambda)[-1, ]

    ### LOG Score
    nested.train.errorscore <- c(nested.train.errorscore, ilasso.cv.error)
    nested.train.aucscore <- c(nested.train.aucscore, ilasso.cv.auc)
    nested.test.errorscore <- c(nested.test.errorscore, ilasso.test.error)
    nested.test.acuscore <- c(nested.test.acuscore, ilasso.test.auc)

    ### LOG Coefs
    Xcoef[, i] <- B

    ### LOG Prediction
    nested.train.Yp[-itest, i] <- ilasso.cv.pred
    nested.train.Ypp[-itest, i] <- ilasso.cv.predprob
    nested.test.Yp[itest, i] <- ilasso.test.pred
    nested.test.Ypp[itest, i] <- ilasso.test.predprob

  }, error = function(e) {
    # Report and skip failed rounds (their columns stay NA).
    # FIX: surface the actual condition message instead of a fixed string.
    print(paste("i=", i, "error:", conditionMessage(e)))
  })
}

Visualize Prediction vs. Observed on Training and Testing data. The Yp is calculated by averaging across rounds.

Predicted vs. Observed

# Predicted vs. observed strategy: each subject's probability is averaged
# across all rounds in which they appeared (NAs from unused rounds removed).
plot_prediction(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), "(Training)")

plot_prediction(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), "(Testing)")

ROC

# ROC curves for the round-averaged predicted probabilities.
plot_roc(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), "(Training)")

plot_roc(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), "(Testing)")

ROC By Sliding Threshold

# ROC computed over a sliding decision threshold.
plot_roc_slide(Y, apply(nested.train.Ypp, 1, mean, na.rm=T), "(Training)")

plot_roc_slide(Y, apply(nested.test.Ypp, 1, mean, na.rm=T), "(Testing)")

### Score log: long-format accuracy/AUC per round for train vs. test.
nested.df <- data.frame(score=1-nested.train.errorscore, data_type="train", score_type="Accuracy", rounds = seq_len(nrounds)) %>%
  rbind(data.frame(score=1-nested.test.errorscore, data_type="test", score_type="Accuracy", rounds = seq_len(nrounds))) %>%
  rbind(data.frame(score=nested.train.aucscore, data_type="train", score_type="AUC", rounds = seq_len(nrounds))) %>%
  rbind(data.frame(score=nested.test.acuscore, data_type="test", score_type="AUC", rounds = seq_len(nrounds))) 

# FIX: geom_hline's parameter is `linetype`; the original `line_type=1` was
# silently ignored ("unknown parameter" warning).
ggboxplot(data=nested.df, x="data_type" , y="score", 
          color = "data_type", #fill = "score_type", 
          facet.by = "score_type", add = "jitter") +
  geom_hline(yintercept = 0.5, col = "gray", linetype = 1) +  # chance level
  ggtitle("Nested CV: AUC and Accuracy for both Training and Testing data") +
  ylim(0,1) +
  theme_pander() 

The left-skewed distribution of betas is a good sign that the betas do not change significantly across CV folds

# Per-connection mean and SD of coefficients across nested-CV rounds.
# A distribution of SDs concentrated near zero indicates stable betas.
# (seq_len(nrow(.)) replaces the redundant seq(1:dim(.)[[1]]).)
Xcoef.stats <- data.frame(beta.id = seq_len(nrow(Xcoef)),     # apply(): 1 = rows, 2 = cols
                          beta.mean = apply(Xcoef, 1, mean),
                          beta.sd = apply(Xcoef, 1, sd)) %>%
  filter(beta.mean != 0) %>%
  arrange(-beta.sd)

Xcoef.stats %>% 
  gghistogram("beta.sd", bins = 100, fill = "steelblue", color = "white") +
  ggtitle("Distribution of Coefficients SD across Nested CV-folds") +
  theme_pander()

Examining the Predictive Connectome

# Threshold on |mean beta| across nested-CV rounds.
nested_beta_thresh <- 0.05

# Average each connection's coefficient over all rounds.
uB <- rowMeans(Xcoef)
conn_betas_nested <- as_tibble(data.frame(index=I$index, Beta=uB))
# Build the nested-CV connectome: join mean betas onto the connection table,
# keep |Beta| >= threshold, then split the "from-to" labels into parts.
connectome_nested <- order %>%
  filter(index %in% I$index) %>%
  inner_join(conn_betas_nested) %>%
  dplyr::select(-censor2) %>%
  #filter(Beta != 0) %>%
  filter(Beta <= -nested_beta_thresh | Beta >= nested_beta_thresh) %>%

  # reformat connectome
  separate(connection, c("connection1", "connection2"))%>%
  separate(network, sep = "-", c("network1", "network2"), remove = F) %>%
  #filter(str_detect(network, pattern = "-1-")) %>%
  # The negative network id "-1" breaks the "-" split; patch network1 here...
  mutate(network1 = ifelse(str_detect(network, pattern = "-1-"), -1, network1)) %>%
  mutate(connection_type = ifelse(network1==network2, "Within", "Between")) %>%
  arrange(index)


# HARD CODE: ...and patch network2 for every "-1-*" pair the split mangled.
# NOTE(review): fragile -- any new network paired with -1 needs a line here.
connectome_nested[connectome_nested$network=="-1-5","network2"] <- "5"
connectome_nested[connectome_nested$network=="-1-7","network2"] <- "7"
connectome_nested[connectome_nested$network=="-1--1","network2"] <- "-1"
connectome_nested[connectome_nested$network=="-1-11","network2"] <- "11"
connectome_nested[connectome_nested$network=="-1-12","network2"] <- "12"
connectome_nested[connectome_nested$network=="-1-13","network2"] <- "13"

After applying threshold 0.05, the remaining number of connections (NESTED) is 216

Stability of Estimated Beta Weights

And now, let’s visualize the beta weights of the connections

# Dot plot of each retained connection's betas, ordered by mean beta.
# FIX: the original added scale_color_gradient2 and then scale_color_viridis;
# ggplot keeps only one colour scale and silently replaced the first with the
# second, so the redundant gradient2 scale is removed (rendered plot unchanged).
ggplot(connectome_nested, aes(x = reorder(connection, Beta), y = Beta)) +
  geom_point(aes(col=Beta), alpha=.5, 
             size=2,
             position = position_jitter(height = 0, width = 0.3)) +
  stat_summary(fun.data = "mean_sdl", geom="point", fill="black", alpha=1, size=1) +
  scale_x_discrete(labels = 
                     paste(connectome_nested$network_names, 
                           " (", 
                           connectome_nested$connection,
                           ")", sep="")) +
  ggtitle(paste("Connection Weights Across Cross-Validation", dim(connectome_nested)[[1]])) +
  ylab(expression(paste(beta, " value"))) +
  xlab("Connection") +
  geom_hline(yintercept = 0, col="grey") +
  stat_summary(fun.data = "mean_cl_boot", 
               col="black", geom="errorbar", width=1) +
  scale_color_viridis(option="plasma", begin=0.2, end=0.9) +
  theme_pander() +
  theme(axis.text.y = element_text(angle=0, hjust=1),
        legend.position = "NA") +
  #ylim(-3, 3) +
  coord_flip()
# Lollipop chart of nested-CV betas, faceted by Within/Between and colored
# by coefficient sign.
connectome_nested %>% 
  mutate(beta_sign = ifelse(Beta >0, "+", "-")) %>%
  ggdotchart(x = "network_names", y = "Beta",
           color = "beta_sign",                                # Color by groups
           palette = c("steelblue", "tomato"), # Custom color palette
           rotate = TRUE,
           facet.by = "connection_type", 
           sort.by.groups = F,
           sort.val = "desc",          # Sort the value in descending order
           sorting = "descending",                       # Sort value in descending order
           add = "segments",                             # Add segments from y = 0 to dots
           add.params = list(color = "lightgray", size = 2), # Change segment color and size
           group = "connection_type",                                # Order by groups
           dot.size = 3,                                 # Large dot size
           #label = round(connectome$Beta,2),                        # Add mpg values as dot labels
           #font.label = list(color = "white", size = 9,
           #                  vjust = 0.5),               # Adjust label parameters
           #group = "cyl",
           #y.text.col = TRUE,
           title = paste("Lasso Connection Weights(Nested):", dim(connectome_nested)[[1]]),
           ggtheme = theme_pander()) +
  geom_hline(yintercept = 0, linetype = 2, color = "black") 

Testing the validity of the Lasso model

Here, we will examine the quality of our Lasso model by doing a series of tests.

Ablation test

In the ablation test, we remove all the connections with significant beta values, and check whether the results are still significant.

# Ablation test: refit the Lasso using ONLY the connections whose main-model
# beta was zero, over the same lambda path, to check that the predictive
# signal is specific to the selected connections.
XX <- X[, conn_betas$Beta == 0]

fit_wo <- glmnet(y = Y,
                 x = XX,
                 alpha=1,
                 lambda = fit$lambda,   # reuse the full model's lambda path for comparability
                 family = "binomial",
                 type.measure = "class",
                 weights = W,
                 standardize = T
)

# Leave-one-out CV (nfolds = number of subjects) for the ablated model.
fit_wo.cv <- cv.glmnet(y = Y,
                       x = XX,
                       alpha=1,
                       weights = W,
                       lambda = fit$lambda,
                       standardize=T,
                       type.measure = "class",
                       family = "binomial",
                       grouped=F,
                       nfolds=length(Y)
)

The model does converge, but its overall classification error is much higher.

# Regularization path of the ablated model; the dashed vertical line marks
# the L1 norm of the coefficients at the CV-selected lambda.
plot(fit_wo, sub="Beta Values for Connectivity")

L1norm <- sum(abs(fit_wo$beta[,which(fit_wo$lambda==fit_wo.cv$lambda.min)]))
abline(v=L1norm, lwd=2, lty=2)

It is useful to plot the two \(\lambda\)-curves (with and without the relevant connections) on the same plot.

# CV error curve for the ablated model, formatted like lasso_df.
# NOTE(review): lasso_df is defined in an earlier chunk not shown here --
# assumed to hold the full model's (lambda, error, sd) curve.
lasso_df_wo <- tibble(lambda=fit_wo.cv$lambda, 
                   error=fit_wo.cv$cvm, 
                   sd=fit_wo.cv$cvsd)



lasso_df$Model <- "Full Model"
lasso_df_wo$Model <- "Without the Selected Connections"

lasso_uber <- rbind(lasso_df, lasso_df_wo)

# Overlay both lambda-error curves; ribbon = +/- 1 CV SD; dashed vertical
# line = the full model's lambda.min.
ggplot(lasso_uber, aes(x = lambda, y = error, fill=Model)) +
  #scale_color_d3() +
  #scale_fill_d3()+
  geom_ribbon(aes(ymin = error - sd, 
                  ymax = error + sd), 
              alpha = 0.5,
              #fill="blue"
              ) +
  geom_line(aes(col=Model), lwd=2) +
  xlab(expression(lambda)) +
  ylab("Cross-Validation Error") +
  ggtitle(expression(paste(bold("Cross Validation Error Across "), lambda))) +
  geom_vline(xintercept = fit.cv$lambda.min,
             linetype="dashed") +
  ylim(0,1) +
  theme_pander() +
  theme(legend.position="bottom")

Variance Inflation Factor

Then, we examine the Variance Inflation Factor (VIF). To calculate the VIF, we need to first create a linear model of the factor effects:

# Linear model on connections with nonzero LOO betas, used only to compute VIFs.
# NOTE(review): `betas` is an N x P matrix here, so `betas != 0` is a logical
# MATRIX used as a column subscript -- likely unintended (perhaps
# colSums(betas != 0) > 0 or colMeans(betas) != 0 was meant); confirm.
dfX <- data.frame(cbind(Y, X[, betas != 0]))
#dfX <- data.frame(cbind(Y, X[conn_betas[conn_betas$Beta!=0,]$index]))
mod<-lm(Y ~ . + 1, as.data.frame(dfX))

We can now calculate the VIF and turn the results into a tibble:

# Variance inflation factors of the reduced linear model (car::vif),
# wrapped in a tibble for plotting.
vifs <- vif(mod)
vifsT <- tibble(VIF = vifs)

And, finally, we can plot a histogram of the distribution of VIF values. VIF values < 10 are considered non-collinear; VIF values < 5 are great. All of our factors have VIF values that are much smaller than 5, which implies that they are as close to an orthogonal basis set as possible.

# Histogram of VIF values; the text above interprets < 5 as negligible
# collinearity among the selected predictors.
ggplot(vifsT, aes( x =VIF)) +
  geom_histogram(col="white", binwidth = 0.1, fill="blue", alpha=0.4) +
  theme_pander() +
  xlab("VIF Value") +
  ylab("Number of Predictors") +
  ggtitle("Distribution of Variance Inflation Factors")


Brain Network Analysis (Power)

A Matrix

To calculate the averaged corr matrix A

  1. find the Fisher’s Z values of the corresponding Pearson correlation coefficients
  2. Average them
  3. Find the reverse Fisher’s Z transform of that average value.
# Group-average connectivity matrix A: average the subject correlations in
# Fisher-z space, then transform back to correlation units.
C.z <- FisherZ(C)                                         # z-transform each subject's correlations
C.zmean <- matrix(colMeans(C.z), nrow=264, ncol = 264)    # average over subjects, reshape to ROI x ROI
A <- FisherZInv(C.zmean)                                  # back-transform to correlation scale
A.vec <- as.vector(A)                                     # flat copy for linear-index lookups

W Matrix

Calculate W from betas and A Matrix

library(circlize)

# Convert a connectome data frame to a 264 x 264 matrix by writing value `w`
# at each linear index in connectome$index.
# FIX: the `w` argument was declared but never used (the body hard-coded
# connectome$W). It now defaults to connectome$W, so existing calls that
# relied on the old behavior are unchanged, while an explicit `w` is honored.
connectom2matrix <- function(connectome, w = connectome$W) {
  empty_mat <- matrix(0, 264, 264, dimnames = list(paste0("X", 1:264), paste0("X", 1:264))) 
  empty_mat[connectome$index] <- w
  return(empty_mat)
}

# Convert a 264 x 264 matrix back into a connectome data frame: the values at
# connectome$index are appended as a new column named `col_name`.
# Simplified: base `[[<-` appends the column directly, replacing the
# temp-column + tidy-eval rename(!!col_name := temp) round-trip.
matrix2connectom <- function(mat, connectome, col_name) {
  connectome[[col_name]] <- mat[connectome$index]
  return(connectome)
}

# Mirror the upper triangle into the lower triangle so `m` is symmetric
# (the lower triangle is assumed to start out empty/zero).
make_symmetric <- function(m) {
  lower <- lower.tri(m)
  m[lower] <- t(m)[lower]
  return(m)
}

# Reshape the Power atlas into the column layout brainconn expects
# (ROI.Name, x/y/z MNI coordinates, network, index).
power_atals <- power2011 %>% 
  rename(ROI.Name = ROI, x.mni=X, y.mni=Y, z.mni=Z, network=NetworkName) %>% 
  mutate(ROI.Name=as.integer(ROI.Name), index = as.integer(ROI.Name),
         x.mni=as.integer(x.mni), y.mni=as.integer(y.mni), z.mni=as.numeric(z.mni)) %>%
  dplyr::select(ROI.Name, x.mni, y.mni, z.mni, network, index)

check_atlas(power_atals)
# Switch: feed the graph analyses below with the nested-CV connectome or the
# non-nested Lasso connectome.
NESTED <- FALSE

if(NESTED) {
  connectome_data <- connectome_nested
} else {
  connectome_data <- connectome
}

# Weighted connectome: W = group-average connectivity (A) times Lasso beta.
Wconnectome <- connectome_data %>%
  mutate(A = A.vec[connectome_data$index], W = A*Beta)
  #separate(connection, c("connection1", "connection2"))%>%
  #separate(network, sep = "-", c("network1", "network2"), remove = F) %>%
  #filter(str_detect(network, pattern = "-1-")) %>%
         #network1 = ifelse(str_detect(network, pattern = "-1-"), -1, network1)) %>%
  #mutate(connection_type = ifelse(network1==network2, "Within", "Between"))

# Cache the weighted connectome on full (non-SKIP) runs only.
if (!SKIP) {
  write_csv(Wconnectome, file="./__cache__/strategy_mr.csv")
}


# Build the symmetric 264 x 264 weight matrix from the linear indices.
W_mat <- matrix(0, ncol = 264, nrow = 264)
W_mat[Wconnectome$index] <- Wconnectome$W
W_mat <- make_symmetric(W_mat) #CHECKED correct W_mat
## TEST CODE: sanity-check counts and the strongest between-network weight.
Wconnectome %>% filter(W>0) %>%
  group_by(connection_type) %>%
  count()

Wconnectome %>% filter(W>0 & connection_type=="Between") %>% arrange(-W)
W_mat[64071]

# Print the row and column names of matrix `mdat` at linear index `ind`.
print_mat_colrow_names <- function(mdat, ind){
  rc <- arrayInd(ind, dim(mdat))
  row_idx <- rc[, 1]
  col_idx <- rc[, 2]
  print(paste("rowname: ", rownames(mdat)[row_idx]))
  print(paste("colname: ", colnames(mdat)[col_idx]))
}

# Inspect which ROI pair the strongest index above corresponds to.
print_mat_colrow_names(W_mat, 64071)
# Lollipop chart of W values, faceted by Within/Between, colored by sign.
Wconnectome %>% 
  mutate(W_sign = ifelse(W >0, "+", "-")) %>%
  ggdotchart(x = "network_names", y = "W",
           color = "W_sign",                                # Color by groups
           palette = c("steelblue", "tomato"), # Custom color palette
           rotate = TRUE,
           facet.by = "connection_type", 
           sort.by.groups = F,
           sort.val = "desc",          # Sort the value in descending order
           sorting = "descending",                       # Sort value in descending order
           add = "segments",                             # Add segments from y = 0 to dots
           add.params = list(color = "lightgray", size = 2), # Change segment color and size
           group = "connection_type",                                # Order by groups
           dot.size = 3,                                 # Large dot size
           #label = round(connectome$Beta,2),                        # Add mpg values as dot labels
           #font.label = list(color = "white", size = 9,
           #                  vjust = 0.5),               # Adjust label parameters
           #group = "cyl",
           #y.text.col = TRUE,
           title = paste("Lasso Connection W:", dim(connectome)[[1]]),
           ggtheme = theme_pander()) +
  geom_hline(yintercept = 0, linetype = 2, color = "black") 

# Line-width matrix for chordDiagram: thicker (2) for positive-weight links.
lwd_mat = matrix(1, nrow = nrow(W_mat), ncol = ncol(W_mat))
rownames(lwd_mat) = rownames(W_mat)
colnames(lwd_mat) = colnames(W_mat)
lwd_mat[W_mat > 0] = 2

# Border-color matrix: black for positive links, gray for negative.
border_mat = matrix(NA, nrow = nrow(W_mat), ncol = ncol(W_mat))
rownames(border_mat) = rownames(W_mat)
colnames(border_mat) = colnames(W_mat)
border_mat[W_mat > 0] = "black"
border_mat[W_mat < 0] = "gray"

# Draft chord diagram with link borders marking the sign of W.
chordDiagram(W_mat, link.lwd = lwd_mat, link.border = border_mat, scale = T, reduce = F)
circos.clear()
# Label rows/cols by network so chordDiagram groups sectors by network name.
rownames(W_mat) = power_atals$network
colnames(W_mat) = power_atals$network

# FIX: each png() device is now closed with dev.off(). The original opened
# successive devices without closing any, so the files were never finalized
# and subsequent plots were drawn to the wrong device.

# Declarative network: only positive-W links visible (others fully transparent).
png(filename = "./__cache__/figures1.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
chordDiagram(W_mat, directional = FALSE, transparency = 0.5, self.link = 2  , symmetric = TRUE, scale = F, reduce = F,
             annotationTrack = c("grid", "axis"), #annotationTrackHeight = mm_h(c(8, 5)),
             grid.col = unique(power2011$Color), col = ifelse(W_mat>0, "tomato2", "#00000000"))
title("Declarative Network")
circos.clear()
dev.off()

# Procedural network: only negative-W links visible.
png(filename = "./__cache__/figures2.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
chordDiagram(W_mat, directional = FALSE, transparency = 0.5, self.link = 2  , symmetric = TRUE, scale = F, reduce = F,
             annotationTrack = c("grid", "axis"),
             grid.col = unique(power2011$Color), col = ifelse(W_mat<0, "steelblue", "#00000000"))
title("Procedural Network")
circos.clear()
dev.off()

# Combined network, drawn to the active/default (interactive) device.
chordDiagram(x=W_mat, directional = FALSE, transparency = .5, self.link = 2, symmetric = TRUE, scale = T, reduce = F,
             grid.col = unique(power2011$Color), col = ifelse(W_mat>0, "tomato2", "steelblue"))
title("Network connections")

circos.clear()

## Network and node descriptives

Next, we will look at Graph properties of two networks

iGraph: Density

The proportion of present edges from all possible edges in the network.

# select cols: edge list (from/to ROI, weight = W) for igraph
roi_links <- Wconnectome %>% dplyr::select(connection1, connection2, W, connection_type, network, network_names)
# rename cols to igraph's expected from/to/weight names
colnames(roi_links) <- c("from", "to", "weight", "connection_type", "network", "network_names")

# Node table: all 264 Power ROIs, with randomly sampled display colors.
roi_nodes <- power2011 %>% rename(id = ROI) %>%
  mutate(NetworkName=factor(NetworkName), 
         Color=factor(Color))
levels(roi_nodes$Color) <- sample(colors(T), 14) 

# create an undirected graph from the edge and node tables
net <- graph_from_data_frame(d=roi_links, vertices=roi_nodes, directed=F) 
#g <- graph_from_adjacency_matrix(W_mat,, mode = "upper")

# Declarative subgraph keeps positive-weight edges; procedural keeps negative.
net.d <- net - E(net)[E(net)$weight<0]
net.p <- net - E(net)[E(net)$weight>0]

# Edge density (observed edges / possible edges) for each subgraph.
df.density <- data.frame("edge_density"=c(edge_density(net.d, loops=F), 
                            edge_density(net.p, loops=F),
                            edge_density(net, loops=F)), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full")))
df.density %>% ggbarplot(x="network", y="edge_density", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = T, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Degree Edge Density")

iGraph: Diameter

A network diameter is the longest geodesic distance (length of the shortest path between two nodes) in the network. In igraph, diameter() returns the distance, while get_diameter() returns the nodes along the first found path of that distance.

# Network diameter (longest geodesic). Edge weights must be positive, so the
# procedural (negative-W) edges are sign-flipped first.

# make negative weights positive for the procedural subgraph
net.p.abs <- net.p
E(net.p.abs)$weight <- E(net.p.abs)$weight * (-1)

# make negative weights positive in the full graph
net.abs <- net
E(net.abs)$weight[E(net.abs)$weight < 0] <- E(net.abs)$weight[E(net.abs)$weight < 0] * (-1)

# FIX: the full-graph call used directed=T on an undirected graph; all three
# calls now consistently use directed=FALSE.
df.diameter <- data.frame("diameter"=c(diameter(net.d, directed=FALSE), 
                        diameter(net.p.abs, directed=FALSE),
                        diameter(net.abs, directed=FALSE)), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full")))
df.diameter %>% ggbarplot(x="network", y="diameter", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = TRUE, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Network Diameter")

#### iGraph: Centrality Degree

Centrality functions (vertex level) and centralization functions (graph level). The centralization functions return res - vertex centrality, centralization, and theoretical_max - maximum centralization score for a graph of that size. The centrality function can run on a subset of nodes (set with the vids parameter). This is helpful for large graphs where calculating all centralities may be a resource-intensive and time-consuming task.

Centrality is a general term that relates to measures of a node’s position in the network. There are many such centrality measures, and it can be a daunting task to wade through all of the different ways to measure a node’s importance in the network. Here, we will introduce just a few examples.

# Graph-level degree centralization for each subgraph.
df.centrality <- data.frame("centr_degree"=c(centr_degree(net.d, normalized=T)$centralization, 
                        centr_degree(net.p, normalized=T)$centralization,
                        centr_degree(net, normalized=T)$centralization), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full")))
df.centrality %>% ggbarplot(x="network", y="centr_degree", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = T, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Network Degree centrality ")

iGraph: Betweeness (Closeness)

Let’s now do the same for betweenness centrality, which is defined as the number of geodesic paths (shortest paths) that go through a given node. Nodes with high betweenness might be influential in a network if, for example, they capture the most amount of information flowing through the network because the information tends to flow through them. Here, we use the normalized version of betweenness.

Closeness (centrality based on distance to others in the graph) Inverse of the node’s average geodesic distance to others in the network.

# Graph-level closeness centralization for each subgraph.
# NOTE(review): "df.sloseness" is a typo for "df.closeness" -- kept as-is in
# case later (unseen) chunks reference this name.
df.sloseness <- data.frame("centr_clo"=c(centr_clo(net.d)$centralization, 
                        centr_clo(net.p)$centralization,
                        centr_clo(net)$centralization), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full")))
df.sloseness %>% ggbarplot(x="network", y="centr_clo", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = T, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Network Closeness")

iGraph: Distances

# Mean geodesic distance between node pairs in each subgraph.
df.distance <- data.frame("mean_distance"=c(mean_distance(net.d, directed=F), 
                        mean_distance(net.p, directed=F),
                        mean_distance(net, directed=F)), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full")))
# FIX: the plot title said "Network Closeness" (copy-pasted from the previous
# chunk); this chart shows mean distance.
df.distance %>% ggbarplot(x="network", y="mean_distance", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = T, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Network Mean Distance")

iGraph: Assortativity

# Degree assortativity: do high-degree nodes preferentially connect to other
# high-degree nodes?
df.assortativity<- data.frame("assortativity_degree"=c(assortativity_degree(net.d, directed=F), 
                        assortativity_degree(net.p, directed=F),
                        assortativity_degree(net, directed=F)), 
           "network" = factor(c("Declarative", "Procedural", "Full"), levels = c("Declarative", "Procedural", "Full"))) 
df.assortativity %>% ggbarplot(x="network", y="assortativity_degree", color = "white", fill=c("tomato", "steelblue", "gray"), width = 0.5, label = T, lab.nb.digits = 5) +
  theme_pander() + 
  ggtitle("Network Assortativity Degree")

W in Brain connection

#colors <- factor(power_atals$network)
#levels(colors) <- colors14
#power_atals$colors <- as.character(temp)
check_atlas(power_atals)
# Declarative-only weight matrix: zero out negative entries.
x1 <- W_mat
x1[x1<0] <- 0


# Declarative network glass-brain views (top/left/back); node size scales
# with each node's degree in the declarative subgraph.
p1 <- brainconn(atlas=power_atals, conmat=x1, node.color = power2011$Color, view = "top",
          node.size = igraph::degree(net.d)*2.5, all.nodes = TRUE, 
          edge.color = "tomato",  edge.color.weighted = FALSE, scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3) #+ ggtitle("Strategy Predictability: W")
p2 <- brainconn(atlas=power_atals, conmat=x1, node.color = power2011$Color, view = "left",
          node.size = igraph::degree(net.d)*2.5, all.nodes = TRUE, 
          edge.color = "tomato",  edge.color.weighted = FALSE, scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3) #+ ggtitle("Strategy Predictability: W")
p3 <- brainconn(atlas=power_atals, conmat=x1, node.color = power2011$Color, view = "back",
          node.size = igraph::degree(net.d)*2.5, all.nodes = TRUE, 
          edge.color = "tomato",  edge.color.weighted = FALSE, scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3) #+ ggtitle("Strategy Predictability: W")

# Procedural-only weight matrix: zero out positive entries; the *-10 below
# flips the sign and rescales so the edges render with visible widths.
x2 <- W_mat
x2[x2>0] <- 0

p4 <- brainconn(atlas=power_atals, conmat=x2*-10, node.color = power2011$Color, view = "top",
          node.size = igraph::degree(net.p)*2.5, all.nodes = TRUE, 
          edge.color = "steelblue",  edge.color.weighted = FALSE, 
          scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3)  



p5 <- brainconn(atlas=power_atals, conmat=x2*-10, node.color = power2011$Color, view = "left",
          node.size = igraph::degree(net.p)*2.5, all.nodes = TRUE, 
          edge.color = "steelblue",  edge.color.weighted = FALSE, 
          scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3) 

p6 <- brainconn(atlas=power_atals, conmat=x2*-10, node.color = power2011$Color, view = "back",
          node.size = igraph::degree(net.p)*2.5, all.nodes = TRUE, 
          edge.color = "steelblue",  edge.color.weighted = FALSE, 
          scale.edge.width = c(1,3), edge.alpha = 0.6,
          label.edge.weight = F,  show.legend = F,
          background.alpha = .3) 
# FIX: every png() device is now paired with a dev.off(); the original opened
# six devices without closing any, so the files were never finalized and each
# new device stacked on top of the previous one. ggplot objects are printed
# explicitly so the draw happens inside the device.
png(filename = "./__cache__/figures4.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p1)
dev.off()
png(filename = "./__cache__/figures5.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p2)
dev.off()
png(filename = "./__cache__/figures6.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p3)
dev.off()
png(filename = "./__cache__/figures7.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p4)
dev.off()
png(filename = "./__cache__/figures8.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p5)
dev.off()
png(filename = "./__cache__/figures9.png",  bg = "transparent", width =500, height = 500, units = "px", res = 150)
print(p6)
dev.off()

# Reference figure: all nodes with uniform connections, on the default device.
x1[,] <-1
brainconn(atlas=power_atals, conmat=x1, node.color = power2011$Color, view = "top",
          node.size = 2, all.nodes = TRUE, 
          edge.color = "tomato",  edge.color.weighted = FALSE, scale.edge.width = c(1,3), edge.alpha = 0.6,
          
          background.alpha = .3)

# Add a legend
# NOTE(review): legend() is a base-graphics call, but brainconn returns a
# ggplot -- this likely needs a prior base plot to attach to; confirm it
# renders as intended.
legend(1, 95, legend=power_atals$network, 
       col=power2011$Color, lty=1:2, cex=0.8)

Distribution of connections

Look at the distribution of network in two groups

# If TRUE, an ROI is counted once for every connection it participates in
# (duplicates kept); otherwise each ROI is counted at most once.
DUPLICATE <- TRUE

# ROI ids at each end of the declarative (W > 0) connections.
c1 <- Wconnectome %>% filter(W>0) %>% mutate(roi = as.integer(connection1)) %>% dplyr::select(roi) %>%  unlist()
c2 <- Wconnectome %>% filter(W>0) %>% mutate(roi = as.integer(connection2)) %>% dplyr::select(roi) %>%  unlist()


# ROI ids at each end of the procedural (W < 0) connections.
c3 <- Wconnectome %>% filter(W<0) %>% mutate(roi = as.integer(connection1)) %>% dplyr::select(roi) %>%  unlist()
c4 <- Wconnectome %>% filter(W<0) %>% mutate(roi = as.integer(connection2)) %>% dplyr::select(roi) %>%  unlist()


if (DUPLICATE) {
  c12 <- c(c1, c2)
  c34 <- c(c3, c4)
} else {
  c12 <- unique(c(c1, c2))
  c34 <- unique(c(c3, c4))
}

# Count declarative (W > 0) connection endpoints per network; the right join
# with the full network/color list keeps zero-count networks.
# FIX: dplyr joins take `by =`, not `on =` (a pandas-ism); the original `on`
# argument was not a join key and the join silently fell back to natural keys.
df.c1 <- power2011[c12,] %>%
  mutate(NetworkName = factor(NetworkName)) %>%
  count(NetworkName, name = "count", .drop = F) %>%
  right_join(power2011 %>% dplyr::select(NetworkName, Color) %>% distinct(), by = "NetworkName") %>%
  mutate(count = as.integer(ifelse(is.na(count), 0, count))) %>%
  arrange(NetworkName)


# df.c1 %>%
#   ggplot(aes(x = count, y=NetworkName)) +
#   geom_col(aes(fill = NetworkName)) +
#   scale_fill_manual(values=colors14, guide = guide_legend(reverse = T)) +
#   #scale_x_reverse(limits = c(8,0)) +
#   theme_pander() + 
#   ggtitle("Declarative network", subtitle = "Distribution of connections") +
#   theme(legend.position = "right", axis.text.y = element_blank(), axis.title.y = element_blank(),
#         plot.title = element_text(size = 20),
#         axis.text = element_text(size = 20),
#         legend.title = element_text(size = 20),
#         legend.text = element_text(size = 20)) 

# Declarative distribution bar chart; the reversed y-axis makes it mirror
# the procedural chart (p8) when the two are arranged side by side.
p7 <- ggbarplot(df.c1, x="NetworkName", y="count", fill = "NetworkName", color="white",
          palette = df.c1$Color, #order = "count", 
          rotate = TRUE, ggtheme = theme_pander(), position = position_dodge(preserve = "single"),
          title = "Distribution of connections") +
      scale_y_reverse(limits=c(14,0), breaks=c(0,5, 10, 15)) +
      #scale_fill_manual(values = sort(df.c1$NetworkName, decreasing = T)) +
      theme(legend.position = "right", axis.text.y = element_blank(), axis.title.y = element_blank(),
        plot.title = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 20))
  
p7

# Count procedural (W < 0) connection endpoints per network (mirror of df.c1).
# FIX: `on=` -> `by=` (the dplyr join-key argument); count coerced to integer
# for consistency with df.c1.
df.c2 <- power2011[c34,] %>%
  mutate(NetworkName = factor(NetworkName)) %>%
  count(NetworkName, name = "count", .drop = F) %>%
  right_join(power2011 %>% dplyr::select(NetworkName, Color) %>% distinct(), by = "NetworkName") %>%
  mutate(count = as.integer(ifelse(is.na(count), 0, count))) %>%
  arrange(NetworkName) 


# df.c2 %>%
#   ggplot(aes(x = count, y=NetworkName)) +
#   geom_col(aes(fill = NetworkName)) +
#   scale_fill_manual(values=colors14, guide = guide_legend(reverse = T)) +
#   theme_pander() + 
#   ggtitle("Procedural network", subtitle = "Distribution of connections") +
#   theme(legend.position = "right", axis.text.y = element_blank(), axis.title.y = element_blank(),
#         plot.title = element_text(size = 20),
#         axis.text = element_text(size = 20),
#         legend.title = element_text(size = 20),
#         legend.text = element_text(size = 20)) 


# Procedural distribution bar chart (counterpart to p7).
# NOTE(review): palette uses df.c1$Color rather than df.c2$Color -- both are
# arranged by NetworkName so the colors coincide, but confirm the intent.
p8 <- ggbarplot(df.c2, x="NetworkName", y="count", fill = "NetworkName", color="white",
          palette = df.c1$Color, #order = "count", 
          rotate = TRUE, ggtheme = theme_pander(), 
          title = "Distribution of connections") +
  #scale_y_reverse() +
  scale_y_continuous(limits = c(0,15), breaks=c(0,5, 10, 15)) +
    theme(legend.position = "right", axis.text.y = element_blank(), axis.title.y = element_blank(),
        plot.title = element_text(size = 20),
        axis.text = element_text(size = 20),
        legend.title = element_text(size = 20),
        legend.text = element_text(size = 20)) 

p8

#png("./figures/network_distributions.png", bg = "white", width =3600, height = 1000, units = "px", res = 150)
# FIX: close the png device after drawing; without dev.off() the file is
# never finalized. The arranged plot is printed explicitly so the draw
# happens inside the device.
png(filename = "./__cache__/figures3.png",  bg = "transparent", width =3600, height = 1000, units = "px", res = 150)
print(ggarrange(p7, NULL, NULL, p8, 
          #labels = c("A", "B", "C"),
          ncol = 4, nrow = 1, align = "h", 
          common.legend = TRUE, legend = "bottom", widths = c(1,.8,.8,1)))
dev.off()

Chi-sq Test

# Contingency test comparing declarative vs. procedural per-network counts.
# FIX: the original also passed `p = noi_stats$N` -- `noi_stats` is undefined
# (probably a typo for coi_stats) and `p` is ignored whenever `y` is supplied;
# the call only ran because R evaluates arguments lazily. The misleading
# argument is removed; the test and its result are unchanged.
chisq.test(df.c1$count, df.c2$count, simulate.p.value = T)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  df.c1$count and df.c2$count
## X-squared = 67.083, df = NA, p-value = 0.01999